0.1 0. Install Packages

pkgs <- c("jiebaR", "tidyverse", "tidytext", "stringr", "e1071", "tidyr", "Rtsne")
pkgs <- pkgs[!pkgs %in% installed.packages()[,"Package"]]
if(length(pkgs)) { install.packages(pkgs)}
library(tidyverse)
library(stringr)
options(stringsAsFactors = F)

1 Loading data

library(jiebaR)
segment_not <- c("鴻海" ,  "永豐金", "中信金", "台積電", "聯發科" ,"兆豐金", "台指期","郭台銘","張忠謀","鉅亨網")
cutter <- worker()
new_user_word(cutter,segment_not)
stopWords <- readRDS("data/stopWords.rds")

2 Stopwords


unnested.df <- stock_news %>%
    select(doc_id = newsId, text = content, status = status_p) %>%
    mutate(word = purrr::map(text, function(x)segment(x, cutter))) %>%
    unnest(word) %>%
    filter(!is.na(word)) %>% 
    filter(!word %in% stopWords$word) %>%
    filter(!str_detect(word, "[a-zA-Z0-9]+")) %>%
    filter(nchar(word) > 1)
unnested.df %>%
    count(doc_id, word) %>%
    spread(word, n, fill = 0) %>% dim
[1]   610 12936

3 4. Chi-square feature selection

chi_df <- unnested.df %>%
    count(word, status) %>%
    filter(n > 3) %>%
    spread(status, n, fill = 0) %>%
    rename(A=`1`, C=`0`) %>%
    mutate(B=sum(A)-A,
           D=sum(C)-C,
           N=A+B+C+D, 
           chi2 = (A*D - B*C)^2 * N / ((A+C)*(A+B)*(B+D)*(C+D))) %>%
    filter(chi2 > 6.64)

4 5. Counting doc term frequency after feature selection

doc_term_count <- unnested.df %>%
    left_join(chi_df) %>%
    filter(!is.na(chi2)) %>%
    count(doc_id, word)
Joining, by = "word"
doc_term_count %>%
    spread(word, n, fill = 0) %>%
    dim
[1] 609 546

5 6. TF-IDF(term frequency & inverse document frequency)

# install.packages("tidytext")
# dtm <- cast_dtm(word_token, title, words, n)
# ??cast_dtm
comb.df <- doc_term_count %>%
    tidytext::bind_tf_idf(word, doc_id, n) %>%
    select(doc_id, word, tf_idf) %>%
    spread(word, tf_idf, fill=0) %>%
    left_join(select(stock_news, doc_id = newsId, status = status_p)) %>%
    select(doc_id, status, everything())
Joining, by = "doc_id"

6 7. T-SNE

library(Rtsne) # cannot be installed in MacOS mojave
tsne <- comb.df %>% select(-doc_id, -status) %>%
    Rtsne(perplexity = 35, dims = 2, check_duplicates = F)
# 取出降維後的特徵值df
feature_tsne <- comb.df %>%
    select(doc_id, status) %>%
    mutate(status = as.factor(status)) %>%
    bind_cols(as.data.frame(tsne$Y)) %>%
    mutate(id = row_number())

7 plotting tsne results

feature_tsne %>%
    ggplot() + aes(V1, V2, color = status) + 
    geom_point()

8 8. divide to tranining and testing set

set.seed(2017)

samples <- sample(1:nrow(feature_tsne), 
                  size = round(nrow(feature_tsne)*0.6))

trainset <- feature_tsne %>% select(-doc_id) %>% slice(samples)
testset <- feature_tsne[-samples,-1]

9 9. SVM

library(e1071)
model <- svm(status~ ., data = trainset, kernel="radial")
plot(model, trainset, V1~V2)

predicting  <- predict(model, testset %>% select(-status))
# creating confusion matrix
# https://en.wikipedia.org/wiki/Confusion_matrix
table(predicting, testset$status)
          
predicting   0   1
         0 126  61
         1  30  27
# accuracy
pre <- predicting == testset$status
percent1 <- length(pre[pre == T]) / length(pre)
percent1
[1] 0.6270492
LS0tCnRpdGxlOiAiU1ZNICYgU3RvY2sgUHJpY2UgUHJlZGljdGlvbiIKYXV0aG9yOiAiSmlsdW5nIEhzaWVoIgpkYXRlOiAiMjAxOC83LzMiCm91dHB1dDogCiAgaHRtbF9ub3RlYm9vazogCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICAgIGhpZ2hsaWdodDogdGV4dG1hdGUKICAgIHRoZW1lOiBzcGFjZWxhYgogICAgdG9jOiB5ZXMKZWRpdG9yX29wdGlvbnM6IAogIGNodW5rX291dHB1dF90eXBlOiBpbmxpbmUKLS0tCgoKIyMgMC4gSW5zdGFsbCBQYWNrYWdlcwoKCmBgYHtyIHByZS1pbnN0YWxsfQpwa2dzIDwtIGMoImppZWJhUiIsICJ0aWR5dmVyc2UiLCAidGlkeXRleHQiLCAic3RyaW5nciIsICJlMTA3MSIsICJ0aWR5ciIsICJSdHNuZSIpCnBrZ3MgPC0gcGtnc1shcGtncyAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygpWywiUGFja2FnZSJdXQppZihsZW5ndGgocGtncykpIHsgaW5zdGFsbC5wYWNrYWdlcyhwa2dzKX0KCmBgYAoKCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShzdHJpbmdyKQpvcHRpb25zKHN0cmluZ3NBc0ZhY3RvcnMgPSBGKQpgYGAKCiMgTG9hZGluZyBkYXRhCmBgYHtyfQpsb2FkKCJkYXRhL3N0b2NrX25ld3MuUkRhdGEiKQpzdG9ja19uZXdzICU+JSBuYW1lcwpzdG9ja19uZXdzICU+JSBzZWxlY3QobmV3c0lkLCB3b3JkcywgdGltZSwgY29kZSwgc3RhdHVzX3AsIHN0YXR1c192LCBldmVyeXRoaW5nKCkpICU+JSBWaWV3CmBgYAoKCmBgYHtyIGplaWJhUiBhbmQgc3RvcCB3b3JkfQpsaWJyYXJ5KGppZWJhUikKc2VnbWVudF9ub3QgPC0gYygi6bS75rW3IiAsICAi5rC46LGQ6YeRIiwgIuS4reS/oemHkSIsICLlj7DnqY3pm7siLCAi6IGv55m856eRIiAsIuWFhuixkOmHkSIsICLlj7DmjIfmnJ8iLCLpg63lj7DpipgiLCLlvLXlv6DorIAiLCLpiYXkuqjntrIiKQpjdXR0ZXIgPC0gd29ya2VyKCkKbmV3X3VzZXJfd29yZChjdXR0ZXIsc2VnbWVudF9ub3QpCnN0b3BXb3JkcyA8LSByZWFkUkRTKCJkYXRhL3N0b3BXb3Jkcy5yZHMiKQpgYGAKCgoKCiMgU3RvcHdvcmRzCgoKYGBge3J9Cgp1bm5lc3RlZC5kZiA8LSBzdG9ja19uZXdzICU+JQogICAgc2VsZWN0KGRvY19pZCA9IG5ld3NJZCwgdGV4dCA9IGNvbnRlbnQsIHN0YXR1cyA9IHN0YXR1c19wKSAlPiUKICAgIG11dGF0ZSh3b3JkID0gcHVycnI6Om1hcCh0ZXh0LCBmdW5jdGlvbih4KXNlZ21lbnQoeCwgY3V0dGVyKSkpICU+JQogICAgdW5uZXN0KHdvcmQpICU+JQogICAgZmlsdGVyKCFpcy5uYSh3b3JkKSkgJT4lIAogICAgZmlsdGVyKCF3b3JkICVpbiUgc3RvcFdvcmRzJHdvcmQpICU+JQogICAgZmlsdGVyKCFzdHJfZGV0ZWN0KHdvcmQsICJbYS16QS1aMC05XSsiKSkgJT4lCiAgICBmaWx0ZXIobmNoYXIod29yZCkgPiAxKQpgYGAKCgoqIG9yaWdpbmFsIGRpbWVuc2lvbjogCj4gNjEwIG5ld3MgeCAxMiw5MzYgd29yZHMKCmBgYHtyfQp1bm5lc3RlZC5kZiAlPiUKICAgIGNvdW50KGRvY19pZCwgd29yZCkgJT4lCiAgICBzcHJlYWQod29yZCwgbiwgZmlsbCA9IDApICU+JSBkaW0Kc2VsZWN0KDE6MjApICU+JSBoZWFkKDEwMCkgJT4lIFZpZXcKYGBgCgoKCiMgNC4gQ2hpLXNxdWFyZSBmZWF0dXJlIHNlbGVjdGlvbgoKYGBge3J9CmNoaV9kZiA8LSB1bm5lc3RlZC5kZiAlPiUKICAgIGNvdW50KHdvcmQsIHN0YXR1cykgJT4lCiAgICBmaWx0ZXIobiA+IDMpICU+JQogICAgc3ByZWFkKHN0YXR1cywgbiwgZmlsbCA9IDApICU+JQogICAgcmVuYW1lKEE9YDFgLCBDPWAwYCkgJT4lCiAgICBtdXRhdGUoQj1zdW0oQSktQSwKICAgICAgICAgICBEPXN1bShDKS1DLAogICAgICAgICAgIE49QStCK0MrRCwgCiAgICAgICAgICAgY2hpMiA9IChBKkQgLSBCKkMpXjIgKiBOIC8gKChBK0MpKihBK0IpKihCK0QpKihDK0QpKSkgJT4lCiAgICBmaWx0ZXIoY2hpMiA+IDYuNjQpCmBgYAoKCiMgNS4gQ291bnRpbmcgZG9jIHRlcm0gZnJlcXVlbmN5IGFmdGVyIGZlYXR1cmUgc2VsZWN0aW9uCmBgYHtyfQpkb2NfdGVybV9jb3VudCA8LSB1bm5lc3RlZC5kZiAlPiUKICAgIGxlZnRfam9pbihjaGlfZGYpICU+JQogICAgZmlsdGVyKCFpcy5uYShjaGkyKSkgJT4lCiAgICBjb3VudChkb2NfaWQsIHdvcmQpCmBgYAoKYGBge3J9CmRvY190ZXJtX2NvdW50ICU+JQogICAgc3ByZWFkKHdvcmQsIG4sIGZpbGwgPSAwKSAlPiUKICAgIGRpbQpgYGAKCgoKCgojIDYuIFRGLUlERu+8iHRlcm0gZnJlcXVlbmN5ICYgaW52ZXJzZSBkb2N1bWVudCBmcmVxdWVuY3nvvIkKCmBgYHtyfQpsaWJyYXJ5KHRpZHl0ZXh0KQojIGluc3RhbGwucGFja2FnZXMoInRpZHl0ZXh0IikKIyBkdG0gPC0gY2FzdF9kdG0od29yZF90b2tlbiwgdGl0bGUsIHdvcmRzLCBuKQojID8/Y2FzdF9kdG0KY29tYi5kZiA8LSBkb2NfdGVybV9jb3VudCAlPiUKICAgIHRpZHl0ZXh0OjpiaW5kX3RmX2lkZih3b3JkLCBkb2NfaWQsIG4pICU+JQogICAgc2VsZWN0KGRvY19pZCwgd29yZCwgdGZfaWRmKSAlPiUKICAgIHNwcmVhZCh3b3JkLCB0Zl9pZGYsIGZpbGw9MCkgJT4lCiAgICBsZWZ0X2pvaW4oc2VsZWN0KHN0b2NrX25ld3MsIGRvY19pZCA9IG5ld3NJZCwgc3RhdHVzID0gc3RhdHVzX3ApKSAlPiUKICAgIHNlbGVjdChkb2NfaWQsIHN0YXR1cywgZXZlcnl0aGluZygpKQpgYGAKCgoKCiMgNy4gVC1TTkUKCgpgYGB7cn0KbGlicmFyeShSdHNuZSkgCgp0c25lIDwtIGNvbWIuZGYgJT4lIHNlbGVjdCgtZG9jX2lkLCAtc3RhdHVzKSAlPiUKICAgIFJ0c25lKHBlcnBsZXhpdHkgPSAzNSwgZGltcyA9IDIsIGNoZWNrX2R1cGxpY2F0ZXMgPSBGKQoKZmVhdHVyZV90c25lIDwtIGNvbWIuZGYgJT4lCiAgICBzZWxlY3QoZG9jX2lkLCBzdGF0dXMpICU+JQogICAgbXV0YXRlKHN0YXR1cyA9IGFzLmZhY3RvcihzdGF0dXMpKSAlPiUKICAgIGJpbmRfY29scyhhcy5kYXRhLmZyYW1lKHRzbmUkWSkpICU+JQogICAgbXV0YXRlKGlkID0gcm93X251bWJlcigpKQpgYGAKCiMgcGxvdHRpbmcgdHNuZSByZXN1bHRzCgpgYGB7cn0KZmVhdHVyZV90c25lICU+JQogICAgZ2dwbG90KCkgKyBhZXMoVjEsIFYyLCBjb2xvciA9IHN0YXR1cykgKyAKICAgIGdlb21fcG9pbnQoKQpgYGAKCgoKCgojIDguIGRpdmlkZSB0byB0cmFuaW5pbmcgYW5kIHRlc3Rpbmcgc2V0CmBgYHtyfQpzZXQuc2VlZCgyMDE3KQoKc2FtcGxlcyA8LSBzYW1wbGUoMTpucm93KGZlYXR1cmVfdHNuZSksIAogICAgICAgICAgICAgICAgICBzaXplID0gcm91bmQobnJvdyhmZWF0dXJlX3RzbmUpKjAuNikpCgp0cmFpbnNldCA8LSBmZWF0dXJlX3RzbmUgJT4lIHNlbGVjdCgtZG9jX2lkKSAlPiUgc2xpY2Uoc2FtcGxlcykKdGVzdHNldCA8LSBmZWF0dXJlX3RzbmVbLXNhbXBsZXMsLTFdCgpgYGAKCgoKIyA5LiBTVk0KYGBge3J9CgpsaWJyYXJ5KGUxMDcxKQptb2RlbCA8LSBzdm0oc3RhdHVzfiAuLCBkYXRhID0gdHJhaW5zZXQsIGtlcm5lbD0icmFkaWFsIikKcGxvdChtb2RlbCwgdHJhaW5zZXQsIFYxflYyKQoKcHJlZGljdGluZyAgPC0gcHJlZGljdChtb2RlbCwgdGVzdHNldCAlPiUgc2VsZWN0KC1zdGF0dXMpKQoKIyBjcmVhdGluZyBjb25mdXNpb24gbWF0cml4CiMgaHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvQ29uZnVzaW9uX21hdHJpeAp0YWJsZShwcmVkaWN0aW5nLCB0ZXN0c2V0JHN0YXR1cykKCiMgYWNjdXJhY3kKcHJlIDwtIHByZWRpY3RpbmcgPT0gdGVzdHNldCRzdGF0dXMKcGVyY2VudDEgPC0gbGVuZ3RoKHByZVtwcmUgPT0gVF0pIC8gbGVuZ3RoKHByZSkKcGVyY2VudDEKYGBgCg==